scRNAseq analysis of murine mammalian tissue

Load the required packages:

library(dplyr)
library(Seurat)
library(patchwork)
library(devtools)

Loading the data

The data were taken from PANGLAODB. It a sample of mammary tissue (mam) from mus musculus that is part of the Tabula Muris project.

load("SRA653146_SRS3044257.sparse.RData")
# retain only the official gene symbol
rownames(sm) <- sapply(strsplit(rownames(sm),"_"), `[`, 1)
#create the Seurat object
mam <- CreateSeuratObject(counts = sm, project = "scMammary", min.cells = 3, min.features = 200)
## Warning: Non-unique features (rownames) present in the input matrix, making
## unique
mam
## An object of class Seurat 
## 21839 features across 3454 samples within 1 assay 
## Active assay: RNA (21839 features, 0 variable features)

Quality control

Is a check of the principal control parameters: number of unique genes detected in each cell (called “Features”), the total number of detected molecules (“Count”) and the percentage of reads that map to the mitochondrial genome.

Check the presence of mithocondrial genes (mt-…) Low-quality / dying cells often exhibit extensive mitochondrial contamination.

grep("^mt-",rownames(mam),value = TRUE)
##  [1] "mt-Atp6" "mt-Co1"  "mt-Co2"  "mt-Co3"  "mt-Cytb" "mt-Nd1"  "mt-Nd2" 
##  [8] "mt-Nd3"  "mt-Nd4"  "mt-Nd4l" "mt-Nd5"  "mt-Nd6"  "mt-Rnr1" "mt-Rnr2"
## [15] "mt-Tc"   "mt-Tn"   "mt-Tp"
# add a colun with the stats
mam[["percent.mt"]] <- PercentageFeatureSet(mam, pattern = "^mt-")

Check the ribosomal protein genes (Rpl/Rps…)

grep("^Rp[ls]",rownames(mam),value = TRUE)
##   [1] "Rpl10"       "Rpl10-ps1"   "Rpl10-ps2"   "Rpl10-ps4"   "Rpl10-ps6"  
##   [6] "Rpl10a"      "Rpl10a-ps1"  "Rpl11"       "Rpl12"       "Rpl12-ps1"  
##  [11] "Rpl12-ps2"   "Rpl13"       "Rpl13-ps1"   "Rpl13-ps5"   "Rpl13-ps6"  
##  [16] "Rpl13a"      "Rpl13a-ps1"  "Rpl14"       "Rpl14-ps1"   "Rpl15"      
##  [21] "Rpl15-ps1"   "Rpl15-ps2"   "Rpl15-ps3"   "Rpl17"       "Rpl17-ps1"  
##  [26] "Rpl17-ps10"  "Rpl17-ps3"   "Rpl17-ps4"   "Rpl17-ps5"   "Rpl17-ps8"  
##  [31] "Rpl17-ps9"   "Rpl18"       "Rpl18-ps1"   "Rpl18-ps2"   "Rpl18a"     
##  [36] "Rpl18a-ps1"  "Rpl19"       "Rpl19-ps1"   "Rpl19-ps11"  "Rpl19-ps12" 
##  [41] "Rpl19-ps2"   "Rpl21"       "Rpl21-ps1"   "Rpl21-ps10"  "Rpl21-ps11" 
##  [46] "Rpl21-ps15"  "Rpl21-ps3"   "Rpl21-ps5"   "Rpl21-ps6"   "Rpl21-ps7"  
##  [51] "Rpl21-ps8"   "Rpl21-ps9"   "Rpl22"       "Rpl22-ps1"   "Rpl22l1"    
##  [56] "Rpl23"       "Rpl23a"      "Rpl23a-ps1"  "Rpl23a-ps14" "Rpl23a-ps2" 
##  [61] "Rpl23a-ps3"  "Rpl23a-ps5"  "Rpl24"       "Rpl26"       "Rpl26-ps4"  
##  [66] "Rpl27"       "Rpl27-ps1"   "Rpl27-ps2"   "Rpl27-ps3"   "Rpl27a"     
##  [71] "Rpl27a-ps1"  "Rpl27a-ps2"  "Rpl28"       "Rpl28-ps1"   "Rpl28-ps3"  
##  [76] "Rpl29"       "Rpl29-ps5"   "Rpl3"        "Rpl3-ps1"    "Rpl3-ps2"   
##  [81] "Rpl30"       "Rpl30-ps1"   "Rpl30-ps10"  "Rpl30-ps11"  "Rpl30-ps3"  
##  [86] "Rpl30-ps5"   "Rpl30-ps9"   "Rpl31"       "Rpl31-ps1"   "Rpl31-ps10" 
##  [91] "Rpl31-ps11"  "Rpl31-ps13"  "Rpl31-ps14"  "Rpl31-ps15"  "Rpl31-ps16" 
##  [96] "Rpl31-ps17"  "Rpl31-ps18"  "Rpl31-ps20"  "Rpl31-ps6"   "Rpl31-ps8"  
## [101] "Rpl31-ps9"   "Rpl32"       "Rpl32-ps"    "Rpl34"       "Rpl34-ps1"  
## [106] "Rpl34-ps2"   "Rpl35"       "Rpl35a"      "Rpl35a-ps2"  "Rpl35a-ps3" 
## [111] "Rpl35a-ps4"  "Rpl35a-ps5"  "Rpl35a-ps6"  "Rpl35a-ps7"  "Rpl36"      
## [116] "Rpl36-ps2"   "Rpl36-ps3"   "Rpl36-ps8"   "Rpl36-ps9"   "Rpl36a"     
## [121] "Rpl36a-ps2"  "Rpl36a-ps3"  "Rpl36a-ps5"  "Rpl36al"     "Rpl37"      
## [126] "Rpl37a"      "Rpl37rt"     "Rpl38"       "Rpl38-ps1"   "Rpl38-ps2"  
## [131] "Rpl39"       "Rpl39-ps"    "Rpl39l"      "Rpl4"        "Rpl41"      
## [136] "Rpl5"        "Rpl5-ps1"    "Rpl5-ps2"    "Rpl6"        "Rpl7"       
## [141] "Rpl7-ps7"    "Rpl7a"       "Rpl7a-ps1"   "Rpl7a-ps10"  "Rpl7a-ps11" 
## [146] "Rpl7a-ps12"  "Rpl7a-ps14"  "Rpl7a-ps3"   "Rpl7a-ps5"   "Rpl7a-ps7"  
## [151] "Rpl7a-ps8"   "Rpl7l1"      "Rpl8"        "Rpl9"        "Rpl9-ps4"   
## [156] "Rpl9-ps6"    "Rpl9-ps7"    "Rplp0"       "Rplp1"       "Rplp1-ps1"  
## [161] "Rplp2"       "Rps10"       "Rps10-ps1"   "Rps10-ps2"   "Rps10-ps4"  
## [166] "Rps11"       "Rps11-ps1"   "Rps11-ps2"   "Rps11-ps3"   "Rps11-ps4"  
## [171] "Rps12"       "Rps12-ps1"   "Rps12-ps10"  "Rps12-ps13"  "Rps12-ps19" 
## [176] "Rps12-ps20"  "Rps12-ps24"  "Rps12-ps3"   "Rps12-ps5"   "Rps12-ps9"  
## [181] "Rps13"       "Rps13-ps1"   "Rps13-ps2"   "Rps13-ps4"   "Rps13-ps5"  
## [186] "Rps14"       "Rps15"       "Rps15-ps2"   "Rps15a"      "Rps15a-ps1" 
## [191] "Rps15a-ps2"  "Rps15a-ps3"  "Rps15a-ps4"  "Rps15a-ps5"  "Rps15a-ps6" 
## [196] "Rps15a-ps7"  "Rps15a-ps8"  "Rps16"       "Rps16-ps2"   "Rps17"      
## [201] "Rps18"       "Rps18-ps1"   "Rps19"       "Rps19-ps1"   "Rps19-ps10" 
## [206] "Rps19-ps11"  "Rps19-ps12"  "Rps19-ps13"  "Rps19-ps2"   "Rps19-ps3"  
## [211] "Rps19-ps4"   "Rps19-ps5"   "Rps19-ps6"   "Rps19-ps7"   "Rps19-ps9"  
## [216] "Rps19bp1"    "Rps2"        "Rps2-ps10"   "Rps2-ps11"   "Rps2-ps13"  
## [221] "Rps2-ps2"    "Rps2-ps4"    "Rps2-ps5"    "Rps2-ps9"    "Rps20"      
## [226] "Rps21"       "Rps23"       "Rps23-ps1"   "Rps23-ps2"   "Rps24"      
## [231] "Rps24-ps2"   "Rps24-ps3"   "Rps25"       "Rps25-ps1"   "Rps26"      
## [236] "Rps26-ps1"   "Rps27"       "Rps27-ps1"   "Rps27a"      "Rps27a-ps1" 
## [241] "Rps27a-ps2"  "Rps27l"      "Rps27rt"     "Rps28"       "Rps29"      
## [246] "Rps3"        "Rps3a1"      "Rps3a2"      "Rps3a3"      "Rps4l"      
## [251] "Rps4x"       "Rps4x-ps"    "Rps5"        "Rps6"        "Rps6-ps4"   
## [256] "Rps6ka1"     "Rps6ka2"     "Rps6ka3"     "Rps6ka4"     "Rps6ka5"    
## [261] "Rps6ka6"     "Rps6kb1"     "Rps6kb2"     "Rps6kc1"     "Rps6kl1"    
## [266] "Rps7"        "Rps8"        "Rps8-ps2"    "Rps8-ps4"    "Rps9"       
## [271] "Rpsa"        "Rpsa-ps1"    "Rpsa-ps10"   "Rpsa-ps11"   "Rpsa-ps12"  
## [276] "Rpsa-ps3"    "Rpsa-ps4"    "Rpsa-ps5"    "Rpsa-ps9"
# add a column with the stats
mam[["percent.rbp"]] <- PercentageFeatureSet(mam, pattern = "^Rp[ls]")

Visualize QC metrics

# Feature = # genes
# Count = # detected molecules
VlnPlot(mam, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.rbp"), ncol = 4, cols = "mediumpurple1")

# plot without dots:
VlnPlot(mam, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.rbp"), ncol = 4, pt.size=0, cols = "mediumpurple1")

Then I check if there is a correlation between the different parameters with FeatureScatter plots:

  • Correlation between % of mitochondrial RNA and number of reads

  • Correlation between number of genes and number of reads

  • Correlation between % of rRNA and number of reads

FeatureScatter(mam, feature1 = "nCount_RNA", feature2 = "percent.mt")

FeatureScatter(mam, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

FeatureScatter(mam, feature1 = "nCount_RNA", feature2 = "percent.rbp")

Low-quality cells or empty droplets will often have very few genes, while cell doublets or multiplets exhibit an aberrantly high gene count. The total number of molecules detected within a cell correlates strongly with unique genes, is the only visible correlation.

QC threshold

From these plots I decide a threshold for the number of genes: between 200 and 4000. The mithocondrial percentage should be lower than 5%.

mam <- subset(mam, subset = nFeature_RNA > 200 & nFeature_RNA < 4000 & percent.mt < 5)
mam
## An object of class Seurat 
## 21839 features across 3134 samples within 1 assay 
## Active assay: RNA (21839 features, 0 variable features)

The remaining samples are 3134 (from 3454 of before)

Data Normalization

10x data are usually just transformed into counts per million (or counts x 10,000 reads) The final “expression estimate” it’s given by the log of the normalized counts:

mam <- NormalizeData(mam, normalization.method = "LogNormalize", scale.factor = 10000)

Cell cycle effect

In order to assign a CC phase to our cells, that come from mouse, I needed to create a list of putative genes suitable for this animal.

convertHumanGeneList <- function(x){
  require("biomaRt")
  human = useMart(biomart="ensembl", dataset = "hsapiens_gene_ensembl", 
                  verbose = TRUE, host = "https://dec2021.archive.ensembl.org")
  mouse = useMart(biomart="ensembl", dataset = "mmusculus_gene_ensembl", 
                  verbose = TRUE, host = "https://dec2021.archive.ensembl.org")
  genes = getLDS(attributes = c("hgnc_symbol"), 
                 filters = "hgnc_symbol", values = x ,
                 mart = human, attributesL = c("mgi_symbol"), 
                 martL = mouse, uniqueRows=T)
  
  humanx <- unique(genes[, 2])
  return(humanx)
}
m.s.genes <- convertHumanGeneList(cc.genes.updated.2019$s.genes)
##    V1
## 1 0.7
## BioMartServer running BioMart version: 0.7
## Mart virtual schema: default
## Mart host: https://dec2021.archive.ensembl.org:443/biomart/martservice
##    V1
## 1 0.7
## BioMartServer running BioMart version: 0.7
## Mart virtual schema: default
## Mart host: https://dec2021.archive.ensembl.org:443/biomart/martservice
m.g2m.genes <- convertHumanGeneList(cc.genes.updated.2019$g2m.genes)
##    V1
## 1 0.7
## BioMartServer running BioMart version: 0.7
## Mart virtual schema: default
## Mart host: https://dec2021.archive.ensembl.org:443/biomart/martservice
##    V1
## 1 0.7
## BioMartServer running BioMart version: 0.7
## Mart virtual schema: default
## Mart host: https://dec2021.archive.ensembl.org:443/biomart/martservice
mam <- CellCycleScoring(mam, s.features = m.s.genes, g2m.features = m.g2m.genes, set.ident = TRUE)

Subset and scaling

I create a subset, keeping only the 2000 most variable genes.

mam <- FindVariableFeatures(mam, selection.method = "vst", nfeatures = 2000)
# top 10 most variable genes:
top10 <- head(VariableFeatures(mam), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(mam)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot2
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 46 rows containing missing values (geom_point).

I shift the expression of each gene so that the mean is 0 and the variance is 1

all.genes <- rownames(mam)
mam <- ScaleData(mam, features = all.genes)
## Centering and scaling data matrix

Dimensional reduction

I perform PCA on the 2000 most variable genes

mam <- RunPCA(mam, features = VariableFeatures(object = mam))
## PC_ 1 
## Positive:  Sparc, Serpinh1, Ifitm3, Ctsl, Aebp1, Plpp3, Mt1, Col1a2, Cd63-ps, Gsn 
##     Mt2, Dcn, Fstl1, Ifitm2, Igfbp7, Col1a1, Rnase4, Serping1, Cd63, Mmp2 
##     C1s1, Serpinf1, Col6a1, Clec3b, Serpinb6a, Nbl1, Gfpt2, C1ra, Ptrf, Rarres2 
## Negative:  Cd52, Ccr7, Ighm, Cd79a, Ctss, Cd74, H2-Eb1, H2-Aa, Tmsb4x, Trbc2 
##     Igkc, H2-Ab1, H2-DMa, Iglc2, Cd83, Cd79b, Hmgb2, Ms4a1, Iglc3, Mzb1 
##     Trbc1, Epsti1, Traf1, Cd8b1, Cd28, Rgs2, Sh2d2a, Samsn1, Ly6d, Ly86 
## PC_ 2 
## Positive:  Tm4sf1, Tinagl1, F11r, Adgrg1, Fxyd3, Bcam, Hspb1, Cdc42ep3, Cav2, Flt1 
##     Crip2, Epcam, Cdh5, Itga6, Rasip1, Esam, Sfn, Hbegf, Tuba1c, Lcn2 
##     RP23-310J6.1, Krt8, Map2k3, Jup, Sox9, RP23-54F9.1, Cd36, Serpinb5, Efhd2, Tjp2 
## Negative:  Clec3b, Col1a1, Col1a2, Dpt, Dpep1, Igfbp6, Mfap5, Serpinf1, Lum, Col3a1 
##     Efemp1, Dcn, Ifi205, Rarres2, Pcolce2, Loxl1, Tnfaip6, Fn1, Serping1, Htra3 
##     Col6a2, Rnase4, Has1, Islr, Col6a1, Fxyd1, Tnxb, Gfpt2, Mmp2, Ogn 
## PC_ 3 
## Positive:  Cdh5, Flt1, Rasip1, Esam, Cd36, Fabp4, Gpihbp1, Cldn5, Adgrf5, Emcn 
##     Apold1, Pecam1, Grrp1, Cd93, Adgrl4, Iigp1, Mmrn2, Eng, Mgll, Ptprb 
##     Egfl7, Aqp1, C1qtnf9, Cd300lg, Jam2, Id1, Slfn5, Ctla2a, Sdpr, Kdr 
## Negative:  Fxyd3, Sfn, Epcam, Sox9, Lgals7, Serpinb5, Krt8, Lcn2, Perp, Tfap2a 
##     Sdc4, Cdh1, Clca3a2, Gjb4, Krt17, Krt14, Krt5, Fermt1, Aqp3, Cnn1 
##     Lypd3, Pmepa1, Cxcl14, Krt18, Plet1, Gjb3, Cdcp1, Acta2, Serinc2, Sox10 
## PC_ 4 
## Positive:  Prlr, Fam25c, Cldn4, Krt7, Tmem56, Cxcl15, Smim22, Areg, Pgr, Acot1 
##     Slc12a2, Cited1, Dkkl1, Krt19, Tspan1, Mlph, Gipc2, Snhg11, Aqp5, Wfdc2 
##     Wnt7b, Rab25, Cdo1, Basp1, Fxyd2, Cldn3, Cldn7, Lgals3, Slc5a5, F3 
## Negative:  Krt5, Krt14, Krt17, Acta2, Cnn1, Cxcl14, Tpm2, Myh11, Slpi, Palld 
##     Cda, Tagln, Col9a2, Lgals7, Myl9, Fermt1, Col17a1, Id4, Ltbp2, Mylk 
##     Moxd1, Gjb4, Rbp1, RP23-372C7.4, Fst, Chil1, Actg2, Gjb3, AC159092.1, Wnt10a 
## PC_ 5 
## Positive:  Prlr, Trbc2, Fam25c, Cxcl15, Tmem56, Cited1, Snhg11, Krt7, Tspan1, Cldn4 
##     Wfdc2, Prss22, Gipc2, Smim22, Pgr, Cdo1, Acot1, Krt19, Slc12a2, Rab25 
##     Stc2, Areg, Ptn, Rnf208, Wnt7b, Slc5a5, Wnt4, Cldn3, Mtmr7, Ly6c1 
## Negative:  Fcer1g, Tyrobp, Aif1, C1qb, C1qc, Cd68, C1qa, Lyz2, Csf1r, Lilr4b 
##     Ptafr, Mpeg1, Ccl9, Lrrc25, Cd14, Il1b, Bcl2a1a, Lst1, Cxcl16, Spi1 
##     Cd300a, Ms4a7, Mrc1, Ccl6, Dnase1l3, Alox5ap, Clec7a, Adgre1, Clec4a2, Bcl2a1d
print(mam[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  Sparc, Serpinh1, Ifitm3, Ctsl, Aebp1 
## Negative:  Cd52, Ccr7, Ighm, Cd79a, Ctss 
## PC_ 2 
## Positive:  Tm4sf1, Tinagl1, F11r, Adgrg1, Fxyd3 
## Negative:  Clec3b, Col1a1, Col1a2, Dpt, Dpep1 
## PC_ 3 
## Positive:  Cdh5, Flt1, Rasip1, Esam, Cd36 
## Negative:  Fxyd3, Sfn, Epcam, Sox9, Lgals7 
## PC_ 4 
## Positive:  Prlr, Fam25c, Cldn4, Krt7, Tmem56 
## Negative:  Krt5, Krt14, Krt17, Acta2, Cnn1 
## PC_ 5 
## Positive:  Prlr, Trbc2, Fam25c, Cxcl15, Tmem56 
## Negative:  Fcer1g, Tyrobp, Aif1, C1qb, C1qc
# visualize the most variable genes
VizDimLoadings(mam, dims = 1:2, reduction = "pca")

VizDimLoadings(mam, dims = 3:4, reduction = "pca")

VizDimLoadings(mam, dims = 4:5, reduction = "pca")

Visualization

Then I visualize the projection of the cells in the first two PCs: (Cells are colored according to cell cycle phase)

DimPlot(mam, reduction = "pca")

The cell cycle seems to be ininfluent!

Select number of PCs

ElbowPlot(mam, ndims= 40)


Choosing how many dimensions to use can vary depending on the method we choose. In general it’s better to keep all PC until 70/75% of the variance is explained

Rule of the thumb:

pc.touse <- (mam$pca@stdev)^2
pc.touse <- pc.touse/sum(pc.touse)
pc.touse <- cumsum(pc.touse)[1:50]
pc.touse <- min(which(pc.touse>=0.85))
pc.touse
## [1] 22

From this the principal components that retain 75-80% of the variance are 22. I chose to do the analysis for 22 PCs and 15 PCs.

Clustering with 15 PC

The first step uses the FindNeighbors function, which constructs a KNN graph based on the euclidean distance in PCA space and refines the edge weights using the Jaccard similarity

mam15 <- FindNeighbors(mam, dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN

To cluster the cell we use the FindClusters function, which uses the Louvain algorithm to iteratively group cells together

mam15 <- FindClusters(mam15, resolution = 0.3)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 3134
## Number of edges: 111730
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9337
## Number of communities: 9
## Elapsed time: 0 seconds
# 9 clusters

TSNE

I plot the clusters using T Stochastic Neighbor Embedding

mam15.tsne <- RunTSNE(mam15, dims=1:15)
DimPlot(mam15.tsne, reduction = "tsne")

UMAP

Uniform Manifold Approximation and Projection is generally preferred but requires the installation of a new package

mam15.UMAP <- RunUMAP(mam15, dims = 1:15)
DimPlot(mam15.UMAP, reduction = "umap")

We can also check whether some of the critial quality paramteres influenced the clustering we got

VlnPlot(mam15.UMAP,features="nCount_RNA")

VlnPlot(mam15.UMAP,features="nFeature_RNA")

VlnPlot(mam15.UMAP,features="percent.mt")

VlnPlot(mam15.UMAP,features="percent.rbp")

or the cell cycle

library(ggplot2)
library(dbplyr)
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
mam15@meta.data %>%
  group_by(seurat_clusters,Phase) %>%
  count()  %>% 
  group_by(seurat_clusters) %>%
  mutate(percent=100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=Phase)) +
  geom_col() + 
  ggtitle("Percentage of cell cycle phases per cluster")

For all the other quality check the clustering seems reasonable.

Clustering with 22 PC

mam22 <- FindNeighbors(mam, dims = 1:22)
## Computing nearest neighbor graph
## Computing SNN

To cluster the cell we use the FindClusters function.

mam22 <- FindClusters(mam22, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 3134
## Number of edges: 117528
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8967
## Number of communities: 12
## Elapsed time: 0 seconds
# 12 clusters

TSNE

I plot the clusters using T Stochastic Neighbor Embedding

mam22.tsne <- RunTSNE(mam22, dims=1:22)
DimPlot(mam22.tsne, reduction = "tsne")

UMAP

Uniform Manifold Approximation and Projection is generally preferred but requires the installation of a new package

mam22.UMAP <- RunUMAP(mam22, dims = 1:22)
DimPlot(mam22.UMAP, reduction = "umap")

We can also check whether some of the critical quality parameters influenced the clustering we got

VlnPlot(mam22.UMAP,features="nCount_RNA")

VlnPlot(mam22.UMAP,features="nFeature_RNA")

VlnPlot(mam22.UMAP,features="percent.mt")

VlnPlot(mam22.UMAP,features="percent.rbp")

or the cell cycle

mam22@meta.data %>%
  group_by(seurat_clusters,Phase) %>%
  count()  %>% 
  group_by(seurat_clusters) %>%
  mutate(percent=100*n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=seurat_clusters,y=percent, fill=Phase)) +
  geom_col() + 
  ggtitle("Percentage of cell cycle phases per cluster")

Maybe the quantity of mt RNA in cluster 4 could make the identification of marker genes difficult. The quality check is good also with 22 PCs.

Finding marker genes for 15PCs clustering

Seurat also includes a function that can be used to find genes over expressed between two clusters or overexpressed in one cluster with respect to all the others

The one vs all clusters analyses can be iterated automatically, and we can output the top n (in this case 5) genes for each cluster.

Notice that here they are sorted by logFC - more informative than “p_val_adj”, since a lot of genes will have a FDR close to zero with smallest changes:

mam15 <- mam15.UMAP
mam15.markers <- FindAllMarkers(mam15, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

mam15.markers %>%
    group_by(cluster) %>%
    slice_max(n = 5, order_by = avg_log2FC)
## # A tibble: 45 × 7
## # Groups:   cluster [9]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene   
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>  
##  1 0               2.49 0.945 0.216 0         0       Trbc2  
##  2 0               2.31 0.956 0.17  0         0       Cd3d   
##  3 0               2.18 0.746 0.049 0         0       Lef1   
##  4 6.70e-180       2.11 0.589 0.123 1.46e-175 0       Trbc1  
##  5 0               2.09 0.835 0.122 0         0       Trac   
##  6 0               4.91 0.982 0.459 0         1       Igkc   
##  7 0               4.02 0.966 0.056 0         1       Cd79a  
##  8 0               3.73 0.998 0.684 0         1       Cd74   
##  9 0               3.42 0.949 0.104 0         1       H2-DMb2
## 10 0               3.33 0.993 0.413 0         1       H2-Aa  
## # … with 35 more rows

Heatmap

mam15.markers %>%
    group_by(cluster) %>%
    top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(mam15, features = top10$gene) + NoLegend()

Looking at the Heatmap we can see that clusters 0 and 4 show quite some of similarities. Also clusters 5, 7 and 8 are similar.

Going in depth

In real life, having two clusters with “too similar” patterns of DE/marker genes might be a problem. Can we conclude that after all they are the same cell type, partitioned wrongly into two separate clusters, or they are indeed two different cell types or subtypes, with a few genes “making the difference” from one another?

T cells - 0/4

We want to asses the cell type shared by cluster 0 and 4.

c0and4.markers <- FindMarkers(mam15, ident.1 = c(0,4), min.pct = 0.25, test.use = "wilcox")
c0and4.markers <- c0and4.markers[order(-c0and4.markers$avg_log2FC), ] 
head(c0and4.markers, n = 20)
##                 p_val avg_log2FC pct.1 pct.2     p_val_adj
## Trbc2    0.000000e+00   3.237016 0.941 0.152  0.000000e+00
## Cd3d     0.000000e+00   2.851358 0.944 0.104  0.000000e+00
## Cd3g     0.000000e+00   2.745328 0.864 0.062  0.000000e+00
## Trbc1   1.110342e-235   2.639035 0.592 0.079 2.424877e-231
## Trac     0.000000e+00   2.581300 0.817 0.068  0.000000e+00
## Emb      0.000000e+00   2.411268 0.791 0.155  0.000000e+00
## Ms4a4b   0.000000e+00   2.195321 0.680 0.035  0.000000e+00
## Cd3e     0.000000e+00   2.092902 0.743 0.052  0.000000e+00
## Ramp3   5.159768e-120   2.043142 0.426 0.094 1.126842e-115
## Lef1     0.000000e+00   2.041274 0.656 0.038  0.000000e+00
## Lat      0.000000e+00   1.977822 0.736 0.068  0.000000e+00
## Ms4a6b  1.727429e-285   1.860232 0.688 0.088 3.772531e-281
## Cd8b1   2.098191e-163   1.853494 0.430 0.047 4.582239e-159
## Lck      0.000000e+00   1.833052 0.743 0.096  0.000000e+00
## Saraf   3.308645e-256   1.807223 0.884 0.419 7.225751e-252
## Rapgef6 7.997470e-194   1.797912 0.722 0.246 1.746567e-189
## Tcf7    6.236144e-289   1.765309 0.652 0.065 1.361912e-284
## Nkg7    1.619036e-151   1.740390 0.381 0.031 3.535814e-147
## Hcst    3.731547e-283   1.738405 0.773 0.146 8.149326e-279
## Dapl1   4.227755e-142   1.709802 0.335 0.018 9.232995e-138

Cd3d, Cd3g, Ms4a4b, Trbc2 and 1 (t cell receptor beta), Trac (T cell receptor alfa) - Tcell It seems that they are all T cells

Let’s see the genes making the difference between those two clusters:

Genes overexpressed in cluster 0 vs cluster 4

c0vs4.markers <- FindMarkers(mam15, ident.1 = 0, ident.2 = 4, min.pct = 0.25, test.use = "wilcox") 
c0vs4.markers <- c0vs4.markers[order(-c0vs4.markers$avg_log2FC),] 
head(c0vs4.markers, n = 30)
##                      p_val avg_log2FC pct.1 pct.2    p_val_adj
## Lef1          4.935662e-46  1.8435953 0.746 0.171 1.077899e-41
## Dapl1         2.473530e-18  1.6084840 0.389 0.050 5.401942e-14
## Cd8b1         3.957465e-18  1.4953134 0.482 0.149 8.642708e-14
## Igfbp4        9.256773e-21  1.4814241 0.451 0.088 2.021587e-16
## Rapgef6       5.134435e-27  1.4493804 0.778 0.420 1.121309e-22
## Klf2          5.828091e-28  1.3808189 0.875 0.514 1.272797e-23
## Actn1         1.169199e-17  0.9980133 0.442 0.105 2.553413e-13
## Pik3ip1       2.754335e-14  0.9336694 0.394 0.110 6.015193e-10
## RP23-32C18.6  7.361641e-14  0.8791695 0.624 0.343 1.607709e-09
## Hspa1a        5.474583e-07  0.8472905 0.460 0.276 1.195594e-02
## Npc2          1.383523e-23  0.8169287 0.905 0.729 3.021476e-19
## RP24-146B4.2  1.635966e-12  0.8169043 0.741 0.475 3.572787e-08
## Ms4a6b        6.469028e-15  0.7902985 0.733 0.448 1.412771e-10
## Rps19         3.712274e-62  0.7697729 0.994 1.000 8.107236e-58
## Ablim1        4.186792e-11  0.7186184 0.629 0.387 9.143535e-07
## Cd8a          4.002654e-12  0.7166578 0.336 0.077 8.741397e-08
## Satb1         3.713756e-11  0.7122203 0.828 0.630 8.110472e-07
## RP23-460F17.1 9.210091e-37  0.6737760 0.986 0.967 2.011392e-32
## Rpl12         7.031175e-50  0.6635674 0.993 1.000 1.535538e-45
## Rps17         3.947776e-44  0.6505499 0.992 0.989 8.621548e-40
## RP23-426M5.1  1.179923e-14  0.6383055 0.867 0.724 2.576835e-10
## Rpl5          5.528782e-40  0.6376929 0.989 0.978 1.207431e-35
## AC110186.1    5.286241e-16  0.6280161 0.902 0.768 1.154462e-11
## RP23-146N20.3 2.277385e-09  0.6267107 0.495 0.282 4.973581e-05
## Gimap6        1.044335e-08  0.6079117 0.731 0.547 2.280724e-04
## Rpl36a-ps2    7.940824e-20  0.6048760 0.959 0.895 1.734197e-15
## Rpl35a        1.036344e-56  0.6039809 0.996 1.000 2.263272e-52
## Stambpl1      5.554188e-05  0.5997311 0.323 0.177 1.000000e+00
## Rplp0         4.189802e-48  0.5988264 0.996 0.994 9.150108e-44
## RP23-336M12.2 1.756165e-18  0.5972720 0.939 0.867 3.835289e-14

Dapl1 - death associated protein 1 - T cells activates CD8

Lef1 - lymphoid enhancer factor (regulates T cell receptor) - CD8 T cells

Cd8b1 - CD8 antigen, beta chain 1 - CD8 T cells

Rapgef6 - Rap guanine nucleotide exchange factor (GEF) 6 transmission signal - CD8 T cells

Cluster 0: Cytotoxic T CELLS

And overexpressed in cluster 4 vs cluster 0

c4vs0.markers <- FindMarkers(mam15, ident.1 = 4, ident.2 = 0, min.pct = 0.25, test.use = "wilcox") 
c4vs0.markers <- c4vs0.markers[order(-c4vs0.markers$avg_log2FC),] 
head(c4vs0.markers, n = 40)
##                 p_val avg_log2FC pct.1 pct.2    p_val_adj
## Ccl5     9.178670e-17   4.019912 0.475 0.241 2.004530e-12
## Tnfrsf4  1.297757e-43   3.204517 0.431 0.073 2.834171e-39
## Tnfrsf9  5.663093e-75   2.728546 0.453 0.027 1.236763e-70
## S100a6   7.763668e-54   2.702791 0.779 0.319 1.695507e-49
## Odc1     8.694749e-41   2.330583 0.691 0.267 1.898846e-36
## Lgals1   8.657629e-47   2.308617 0.779 0.344 1.890740e-42
## Bcl2a1b  2.809613e-75   2.293395 0.652 0.100 6.135915e-71
## Xcl1     6.719753e-32   2.255040 0.260 0.029 1.467527e-27
## Ctla4    2.009969e-29   2.217114 0.343 0.067 4.389570e-25
## Lgals3   1.778136e-18   2.205171 0.508 0.244 3.883270e-14
## Rgs1     1.583210e-57   2.179747 0.680 0.162 3.457573e-53
## Rora     7.718335e-73   2.173113 0.602 0.085 1.685607e-68
## Il2rb    5.450829e-88   2.162441 0.862 0.206 1.190407e-83
## Icos     3.311547e-41   2.155042 0.674 0.236 7.232087e-37
## Cxcr6    1.156520e-33   2.135533 0.414 0.096 2.525723e-29
## Fcer1g   2.241468e-33   2.129084 0.254 0.025 4.895141e-29
## Bcl2a1d  5.460339e-91   2.119327 0.552 0.036 1.192483e-86
## S100a11  5.471073e-54   1.871773 0.867 0.378 1.194828e-49
## Id2      2.002882e-31   1.839337 0.762 0.404 4.374094e-27
## Rgs2     2.459777e-43   1.838875 0.818 0.384 5.371908e-39
## S100a4   4.182001e-87   1.830910 0.448 0.012 9.133073e-83
## Crip1    6.623109e-22   1.825804 0.856 0.636 1.446421e-17
## Dgat1    6.392839e-38   1.790821 0.586 0.180 1.396132e-33
## Ikzf2    1.610807e-65   1.782292 0.530 0.064 3.517842e-61
## Eprs     1.714339e-26   1.686793 0.497 0.179 3.743944e-22
## Gata3    1.280414e-29   1.685198 0.448 0.119 2.796297e-25
## Ramp3    1.892877e-11   1.654939 0.580 0.397 4.133853e-07
## Capg     1.788797e-52   1.648506 0.586 0.125 3.906554e-48
## Anxa2    4.719188e-48   1.626626 0.564 0.122 1.030623e-43
## Stx11    1.225240e-61   1.610300 0.398 0.027 2.675802e-57
## Ccr2     3.008786e-55   1.563099 0.304 0.011 6.570889e-51
## Ikzf3    2.182000e-48   1.555595 0.436 0.064 4.765270e-44
## Cd74     2.278393e-06   1.538211 0.729 0.621 4.975782e-02
## Cstb     7.818135e-18   1.532447 0.586 0.307 1.707403e-13
## Tnfrsf18 1.963210e-27   1.531602 0.657 0.313 4.287454e-23
## Lmna     7.664273e-34   1.498586 0.608 0.212 1.673801e-29
## Crem     1.039997e-29   1.490932 0.851 0.522 2.271249e-25
## H2afz    1.502953e-27   1.474332 0.729 0.412 3.282299e-23
## Cd7      3.707604e-17   1.469328 0.365 0.134 8.097036e-13
## Lmo4     5.000408e-11   1.456491 0.265 0.103 1.092039e-06

Ccl5 - chemokine ligand - all the cells (overexpressed in activated T)

Tnfrsf4 and 9 - TNF receptor superfamily (citokyne receptor) - 4 overexpressed in T cells

Il2rb - expressed mainly in T reg cells

Icos - inducible T cell co-stimulator or Il2rb interleukin 2 receptor for T reg

Ctla4 - citotoxic T cell

Cluster 4: Regulatory T CELLS

c4.markers <- FindMarkers(mam15, ident.1 = 4, min.pct = 0.25, test.use = "wilcox")
c4.markers<- c4.markers[order(-c4.markers$avg_log2FC),] 
head(c4.markers, n = 20)
##                  p_val avg_log2FC pct.1 pct.2     p_val_adj
## Tnfrsf4   1.688780e-82   3.386692 0.431 0.055  3.688127e-78
## Ccl5      2.410088e-18   2.963408 0.475 0.252  5.263391e-14
## Tnfrsf9  1.168141e-148   2.757913 0.453 0.026 2.551104e-144
## Icos     3.534140e-129   2.642139 0.674 0.091 7.718209e-125
## Il2rb    2.950531e-220   2.551248 0.862 0.087 6.443664e-216
## Ramp3     5.388275e-45   2.526703 0.580 0.194  1.176745e-40
## Ctla4     1.524957e-77   2.390548 0.343 0.034  3.330353e-73
## Xcl1      1.796225e-91   2.332211 0.260 0.012  3.922775e-87
## Cxcr6     1.212125e-97   2.296950 0.414 0.040  2.647160e-93
## Odc1      1.549042e-38   2.194837 0.691 0.334  3.382953e-34
## Rgs1      6.155512e-97   2.117851 0.680 0.124  1.344302e-92
## Rora      8.504234e-82   2.085887 0.602 0.124  1.857240e-77
## Nkg7      1.589384e-26   2.068852 0.414 0.144  3.471056e-22
## Tnfrsf18  5.158250e-81   2.057356 0.657 0.147  1.126510e-76
## Dgat1     2.401817e-71   1.963953 0.586 0.128  5.245329e-67
## Ikzf2    7.169379e-160   1.909731 0.530 0.035 1.565721e-155
## Id2       7.789812e-47   1.882657 0.762 0.328  1.701217e-42
## Rgs2      8.197631e-55   1.774368 0.818 0.343  1.790281e-50
## Gata3     1.122760e-48   1.746248 0.448 0.099  2.451995e-44
## Bcl2a1b   3.917433e-68   1.746028 0.652 0.154  8.555282e-64

Macrophages - 5

Cluster 5, 7 and 8

c5and6and7.markers <- FindMarkers(mam15, ident.1 = c(5,6,7), min.pct = 0.25, test.use = "wilcox")
c5and6and7.markers <- c5and6and7.markers[order(-c5and6and7.markers$avg_log2FC), ] 
head(c5and6and7.markers, n = 10)
##                p_val avg_log2FC pct.1 pct.2     p_val_adj
## Krt14   2.577047e-80   4.148111 0.426 0.092  5.628013e-76
## Lyz2    2.311256e-98   4.124770 0.372 0.049  5.047553e-94
## Krt17   7.788089e-81   4.013333 0.435 0.093  1.700841e-76
## Plet1   1.075669e-83   3.932679 0.599 0.180  2.349153e-79
## Lcn2   1.517368e-123   3.619132 0.645 0.158 3.313779e-119
## Slpi   1.455057e-104   3.562847 0.457 0.076 3.177698e-100
## Sfn     4.343352e-77   3.483441 0.648 0.253  9.485447e-73
## Acta2   1.805194e-80   3.408883 0.389 0.073  3.942364e-76
## Cxcl14  3.513107e-72   3.119368 0.386 0.082  7.672274e-68
## Epcam  3.968868e-122   2.987532 0.625 0.142 8.667610e-118

Krt14 and Krt17 basal cells, Lyz2 macrophages, Plet1 al biological processes? - basal cells luminal epitelial, Slpi basal cells, Sfn basal and luminal epitelial, Acta2 and Cxcl14 basal

seem to be basal cell, but I need to further assess the gene expression

Cluster 5

c5vs78.markers <- FindMarkers(mam15, ident.1 = 5, ident.2 = c(7,8), min.pct = 0.25, test.use = "wilcox") 
c5vs78.markers <- c5vs78.markers[order(-c5vs78.markers$avg_log2FC),] 
head(c5vs78.markers , n = 10)
##               p_val avg_log2FC pct.1 pct.2    p_val_adj
## Lyz2   2.593171e-29   5.268751 0.759 0.181 5.663226e-25
## Ccl4   7.953952e-11   4.415573 0.447 0.141 1.737064e-06
## H2-Ab1 7.664238e-23   3.764166 0.844 0.752 1.673793e-18
## Apoe   7.583332e-07   3.628451 0.723 0.671 1.656124e-02
## C1qb   4.368295e-13   3.518512 0.404 0.067 9.539919e-09
## Ccl7   9.815155e-03   3.438505 0.426 0.376 1.000000e+00
## C1qa   4.658022e-13   3.436493 0.397 0.060 1.017265e-08
## Cxcl2  2.153981e-04   3.408237 0.433 0.302 1.000000e+00
## Il1b   3.461477e-12   3.282038 0.447 0.107 7.559519e-08
## Tyrobp 1.627813e-28   3.064122 0.872 0.315 3.554980e-24

Lyz2, Ccl4, C1qb - ONLY expressed in macrophages

H2-Ab1 - b cells and macrophages

Cluster 5 vs ALL: Lyz2 -> produces Lyzozyme which has bacteriolytic function, contained in macrophages

Ccl4, IL-1β expressed in a wide range of cells, especially in macrophages

MACROPHAGES

Luminal epithelial - 7/8

Cluster 7

c7vs8.markers <- FindMarkers(mam15, ident.1 = 7, ident.2 = 8, min.pct = 0.25, test.use = "wilcox") 
c7vs8.markers <- c7vs8.markers[order(-c7vs8.markers$avg_log2FC),] 
head(c7vs8.markers , n = 10)
##                p_val avg_log2FC pct.1 pct.2    p_val_adj
## Ccl5    7.896022e-03   5.651256 0.579 0.556 1.000000e+00
## Lcn2    9.588262e-12   4.104435 0.768 0.333 2.093981e-07
## Wfdc18  2.419885e-10   4.095119 0.653 0.167 5.284788e-06
## Csn3    1.660745e-09   4.071832 0.600 0.130 3.626901e-05
## Cst3    2.644240e-18   4.066987 0.947 0.648 5.774756e-14
## Epsti1  2.175447e-05   3.971046 0.411 0.111 4.750958e-01
## Plet1   1.667773e-06   3.579314 0.821 0.741 3.642250e-02
## Vim     9.131721e-04   3.379894 0.526 0.352 1.000000e+00
## Aldh1a3 4.932237e-10   3.236404 0.589 0.074 1.077151e-05
## Lsp1    3.269450e-03   3.162157 0.400 0.241 1.000000e+00

Csn3 - luminal epithelial (caseina) - milk gene

Lcn2 lipocalin 2 - luminal progenitor

Plet1 - luminal cells

Wfdc18 - WAP four-disulfide core domain 18 - milk gene in the luminal progenitor

Aldh1a3 - progenitor luminal marker (from article)

Cluster 7 vs ALL: Csn3 produces caseine in luminal epithelial cells, Wfdc18 WAP is produced selectively by committed luminal cells within mammary ducts and alveoli

LUMINAL EPITHELIAL PROGENITOR

Cluster 8

c8vs7.markers <- FindMarkers(mam15, ident.1 = 8, ident.2 = 7, min.pct = 0.25, test.use = "wilcox") 
c8vs7.markers<- c8vs7.markers[order(-c8vs7.markers$avg_log2FC),] 
head(c8vs7.markers , n = 20)
##                      p_val avg_log2FC pct.1 pct.2    p_val_adj
## Ptn           2.150207e-26   5.127102 0.981 0.211 4.695838e-22
## Fam25c        4.283342e-28   4.152706 0.963 0.063 9.354391e-24
## Areg          3.516379e-21   4.119608 0.889 0.211 7.679420e-17
## Gpx3          1.417817e-25   4.072971 0.944 0.126 3.096370e-21
## Wfdc2         1.562589e-25   3.786876 0.981 0.211 3.412539e-21
## Ly6a          2.395399e-23   3.534809 0.963 0.295 5.231312e-19
## Fxyd2         1.196481e-10   3.485120 0.796 0.358 2.612996e-06
## Nupr1         1.409345e-18   3.403162 0.963 0.579 3.077868e-14
## Prlr          3.688473e-27   3.357316 0.963 0.116 8.055256e-23
## Slc12a2       5.449938e-20   3.129085 0.852 0.179 1.190212e-15
## Ly6d          5.553404e-19   3.072081 0.981 0.589 1.212808e-14
## Cd164         2.357259e-22   3.016621 0.963 0.505 5.148019e-18
## Tgm2          5.827858e-21   3.013323 0.870 0.179 1.272746e-16
## Glul          1.024538e-22   2.965690 1.000 0.484 2.237488e-18
## Cited1        3.918453e-20   2.857229 0.833 0.126 8.557510e-16
## RP23-396L12.1 2.262669e-13   2.852974 0.519 0.021 4.941442e-09
## F3            1.177899e-16   2.585397 0.944 0.547 2.572413e-12
## Gadd45g       2.268068e-16   2.509278 0.907 0.400 4.953234e-12
## Mt1           5.901599e-15   2.504591 0.981 0.916 1.288850e-10
## Acot1         1.011265e-20   2.248728 0.833 0.116 2.208502e-16

Ptn - pleiotropin, expressed in epithelial cells of the mammary gland

Amphiregulin (AREG), a ligand for epidermal growth factor receptor, is required for mammary gland ductal morphogenesis - found in mature luminal

Ly6a marker for mature luminal cells

Prlr - prolactine receptor

cluster8.markers <- FindMarkers(mam15, ident.1 = 8, min.pct = 0.25, test.use = "wilcox")
cluster8.markers<- cluster8.markers[order(-cluster8.markers$avg_log2FC),] 
head(cluster8.markers, n = 20)
##                       p_val avg_log2FC pct.1 pct.2     p_val_adj
## Ptn            1.638583e-88   4.966034 0.981 0.133  3.578500e-84
## Krt19          2.771448e-57   4.664426 1.000 0.257  6.052566e-53
## Krt18          1.040501e-54   4.199693 1.000 0.264  2.272349e-50
## Areg           2.507864e-99   4.188402 0.889 0.085  5.476924e-95
## Fam25c        6.338078e-236   4.144952 0.963 0.031 1.384173e-231
## Fxyd2          9.561550e-85   4.089939 0.796 0.073  2.088147e-80
## Krt8           1.632043e-67   3.908918 1.000 0.190  3.564218e-63
## Nupr1          4.767968e-50   3.747800 0.963 0.244  1.041277e-45
## Wfdc2         2.931051e-109   3.746610 0.981 0.097 6.401123e-105
## F3             1.189587e-83   3.729408 0.944 0.123  2.597939e-79
## Prlr          5.083734e-242   3.419781 0.963 0.030 1.110237e-237
## Slc12a2       4.102617e-129   3.262834 0.852 0.054 8.959706e-125
## Gpx3           1.024273e-61   3.212295 0.944 0.163  2.236910e-57
## Ly6d           4.641992e-36   3.185266 0.981 0.403  1.013765e-31
## Krt7          7.100285e-152   3.132990 1.000 0.065 1.550631e-147
## S100a6         1.686300e-34   3.087429 1.000 0.515  3.682710e-30
## Cldn4         4.966306e-154   3.029716 0.889 0.046 1.084592e-149
## Cited1        1.144805e-175   2.934301 0.833 0.033 2.500140e-171
## RP23-396L12.1 5.229750e-131   2.821598 0.519 0.015 1.142125e-126
## Bhlhe41        6.679118e-84   2.817685 0.852 0.086  1.458653e-79

Areg, Prlr prolactin receptor

Pgr progesterone receptor - Mature luminal (from tabula muris article)

MATURE LUMINAL

B cells - 1

Cluster 1 vs all

cluster1.markers <- FindMarkers(mam15, ident.1 = 1, min.pct = 0.25, test.use = "wilcox")
cluster1.markers<- cluster1.markers[order(-cluster1.markers$avg_log2FC),] 
head(cluster1.markers, n = 20)
##                 p_val avg_log2FC pct.1 pct.2     p_val_adj
## Igkc     0.000000e+00   4.909842 0.982 0.459  0.000000e+00
## Cd79a    0.000000e+00   4.018468 0.966 0.056  0.000000e+00
## Cd74     0.000000e+00   3.730391 0.998 0.684  0.000000e+00
## H2-DMb2  0.000000e+00   3.419685 0.949 0.104  0.000000e+00
## H2-Aa    0.000000e+00   3.325257 0.993 0.413  0.000000e+00
## H2-Ab1   0.000000e+00   3.291506 0.991 0.485  0.000000e+00
## Iglc2    0.000000e+00   3.149693 0.801 0.093  0.000000e+00
## Ighm     0.000000e+00   3.062614 0.959 0.371  0.000000e+00
## H2-Eb1   0.000000e+00   3.016879 0.991 0.388  0.000000e+00
## Ebf1     0.000000e+00   2.845617 0.981 0.257  0.000000e+00
## Cd79b    0.000000e+00   2.634404 0.721 0.081  0.000000e+00
## Ighd     0.000000e+00   2.563363 0.765 0.058  0.000000e+00
## Fcmr     0.000000e+00   2.484154 0.694 0.049  0.000000e+00
## Mef2c    0.000000e+00   2.462174 0.763 0.103  0.000000e+00
## Ms4a1    0.000000e+00   2.457968 0.638 0.028  0.000000e+00
## Cd83    2.117549e-278   2.412450 0.758 0.149 4.624516e-274
## Iglc3   1.163791e-293   2.335728 0.623 0.047 2.541603e-289
## H2-Ob    0.000000e+00   2.261084 0.761 0.090  0.000000e+00
## Scd1    1.651398e-305   2.216370 0.752 0.127 3.606487e-301
## Cd37    3.900746e-273   2.106576 0.883 0.373 8.518840e-269

Igkc immunoglobulin kappa constant - B cells

Cd74 - B cells and macrophages

Cd79a - B cells creates Ig alpha

Cd79b - B cells creates Ig beta

H2-.. - histocompatibility complex 2, found on B cells

Stromal cells - 2

cluster2.markers <- FindMarkers(mam15, ident.1 = 2, min.pct = 0.25, test.use = "wilcox")
cluster2.markers<- cluster2.markers[order(-cluster2.markers$avg_log2FC),] 
head(cluster2.markers, n = 20)
##                  p_val avg_log2FC pct.1 pct.2     p_val_adj
## Dcn      2.011460e-306   5.964926 0.965 0.220 4.392827e-302
## Gsn      1.501882e-252   5.337764 0.998 0.448 3.279960e-248
## Tnfaip6   0.000000e+00   5.148355 0.940 0.164  0.000000e+00
## Col3a1    0.000000e+00   4.929044 0.878 0.075  0.000000e+00
## Col1a2    0.000000e+00   4.539514 0.940 0.038  0.000000e+00
## Col1a1    0.000000e+00   4.433464 0.918 0.026  0.000000e+00
## Clec3b    0.000000e+00   4.383540 0.876 0.056  0.000000e+00
## Ccl2     4.182943e-137   4.166515 0.707 0.214 9.135129e-133
## Cxcl1    1.130420e-145   4.053782 0.792 0.266 2.468724e-141
## Igfbp6    0.000000e+00   4.008005 0.896 0.107  0.000000e+00
## Plac9c    0.000000e+00   3.978710 0.784 0.075  0.000000e+00
## Lum       0.000000e+00   3.902518 0.881 0.064  0.000000e+00
## Dpt       0.000000e+00   3.727592 0.841 0.048  0.000000e+00
## Fn1       0.000000e+00   3.673571 0.739 0.061  0.000000e+00
## Postn    1.366508e-271   3.635399 0.767 0.089 2.984317e-267
## Serpinf1  0.000000e+00   3.631132 0.888 0.057  0.000000e+00
## Ifi205    0.000000e+00   3.596225 0.849 0.040  0.000000e+00
## Ptx3     8.410029e-161   3.540867 0.618 0.113 1.836666e-156
## Mmp3     4.404092e-203   3.475974 0.566 0.055 9.618097e-199
## Ccl11     0.000000e+00   3.472359 0.784 0.039  0.000000e+00

Dcn decorin (proteoglycan) - Stromal cells

Gsn gelsolina (actin binding protein) - Stromal cells

Tnfaip6 - stromal cells

Col3a1 collagene type III - Stromal cells

Col1 collagen type II - Stromal cells

Endotelial cells - 3

cluster3.markers <- FindMarkers(mam15, ident.1 = 3, min.pct = 0.25, test.use = "wilcox")
cluster3.markers<- cluster3.markers[order(-cluster3.markers$avg_log2FC),] 
head(cluster3.markers, n = 20)
##                 p_val avg_log2FC pct.1 pct.2     p_val_adj
## Fabp4   1.566507e-152   6.390228 0.815 0.225 3.421095e-148
## Cldn5   9.642696e-211   4.474197 0.555 0.035 2.105868e-206
## Aqp1    2.007816e-134   4.215125 0.509 0.062 4.384870e-130
## Id1     5.991903e-134   3.883379 0.566 0.083 1.308572e-129
## Tm4sf1  2.165186e-110   3.724753 0.755 0.247 4.728551e-106
## Cd36    1.526337e-187   3.673419 0.596 0.058 3.333367e-183
## Iigp1   9.138218e-145   3.380963 0.615 0.091 1.995696e-140
## Ly6c1   1.356899e-107   3.191548 0.653 0.162 2.963331e-103
## Gpihbp1 1.873827e-275   3.175703 0.509 0.009 4.092250e-271
## Ctla2a   8.744844e-98   3.135045 0.615 0.154  1.909786e-93
## Flt1     0.000000e+00   2.957174 0.604 0.012  0.000000e+00
## Rasip1   0.000000e+00   2.951594 0.642 0.018  0.000000e+00
## Ifitm3   8.971972e-96   2.919977 0.830 0.436  1.959389e-91
## Isg15    1.026771e-96   2.860699 0.547 0.113  2.242365e-92
## Rnd1     1.006319e-99   2.775785 0.396 0.048  2.197699e-95
## Ier3     5.608218e-80   2.758546 0.675 0.238  1.224779e-75
## Akap12  3.833581e-113   2.750555 0.502 0.075 8.372159e-109
## Emp1     1.013209e-52   2.745602 0.630 0.287  2.212747e-48
## Cdkn1a   1.096357e-87   2.740566 0.717 0.278  2.394335e-83
## Esam    3.950912e-301   2.721911 0.608 0.018 8.628397e-297

Fabp4 fatty acid biding protein - endotelial cells

Cldn5 endothelial thight junctions - endotelial cells

Aqp1 aquaporin 1 - endotelial cells

Cd36 lipid transport - endotelial cells

Esam - endotelial cell specific adhesion protein

Basal cells - 6

Cluster 6

c6vs8.markers <- FindMarkers(mam15, ident.1 = 6, ident.2 = 8, min.pct = 0.25, test.use = "wilcox") 
c6vs8.markers <- c6vs8.markers[order(-c6vs8.markers$avg_log2FC),] 
head(c6vs8.markers , n = 10)
##               p_val avg_log2FC pct.1 pct.2    p_val_adj
## Krt14  1.064588e-25   5.809747 0.991 0.167 2.324954e-21
## Krt17  6.728689e-26   5.591590 1.000 0.296 1.469478e-21
## Acta2  1.855855e-25   5.261605 0.991 0.259 4.053002e-21
## Slpi   9.494052e-25   5.052529 0.983 0.148 2.073406e-20
## Cxcl14 8.813886e-26   4.969043 0.991 0.130 1.924865e-21
## Apoe   1.171922e-25   4.768597 1.000 0.611 2.559360e-21
## Lcn2   3.024460e-25   4.703949 0.991 0.333 6.605118e-21
## Tpm2   4.782964e-26   4.459295 1.000 0.241 1.044551e-21
## Chil1  6.708454e-19   3.814005 0.828 0.056 1.465059e-14
## Fst    7.451984e-23   3.572644 0.922 0.056 1.627439e-18

Krt14 and Krt17 marker for basal cells, also Acta2

BASAL CELLS

Visualizing marker genes

Plot the expression of the markers with a heatmap

FeaturePlot(mam15, features = c("Trbc2", "Cd79a", "Dcn", "Fabp4", "Lyz2", "Krt14", "Plet1", "Areg"))

I inspect in depth the subpopulations of T cells:

FeaturePlot(mam15, features = "Il2rb", repel = T) 

FeaturePlot(mam15, features = "Cd8b1", repel = T) 

Or with a violin plot

VlnPlot(mam15, features = "Cd8b1")

VlnPlot(mam15, features = "Cd79a")

VlnPlot(mam15, features = "Dcn")

VlnPlot(mam15, features = "Fabp4")

VlnPlot(mam15, features = "Il2rb")

VlnPlot(mam15, features = "Lyz2")

VlnPlot(mam15, features = "Krt14")

VlnPlot(mam15, features = "Plet1")

VlnPlot(mam15, features = "Areg")

We can visualize all the marker genes and their expression (CPM) in each cluster using a dot plot.

library(ggrepel)
DotPlot(mam15, features = c("Trbc2", "Cd79a", "Dcn", "Fabp4", "Il2rb", "Lyz2", "Krt14", "Plet1", "Areg")) + theme(axis.text.x = element_text(angle = 90))

Dot plot with Cd8b1 gene marker for cluster 0 instead of Trbc2:

library(ggrepel)
DotPlot(mam15, features = c("Cd8b1", "Cd79a", "Dcn", "Fabp4", "Il2rb", "Lyz2", "Krt14", "Plet1", "Areg")) + theme(axis.text.x = element_text(angle = 90))

Both cluster 0 and 4 are T cells, but it seems that only cluster 0 contains CD8+ T cells.

Final Result

Using marker genes to infere the subtypes of each cluster was quite difficult and needed a deep knowlegde of the fild (which in our case is limited). Eventually we were able to identify 10 different cells types of which 4 subtypes of ODC, 2 subtypes of AST and Microglia, Neuros, OPC ad Endothelial cells.

Comparing the results with the automated pipeline employed by Panglao, we were able to label and cluster the cells with unknown cell type. Also we did not found Pancreatich stellate cells and Pericytes, may be due to different clustering parameters.

new.cluster.ids <- c("T cytotoxic", "B cells", "Stromal cells", "Endotelial cells", "T reg", "Macrophages", "Basal cells", "Luminal Progenitor ", 'Mature Luminal') 
names(new.cluster.ids) <- levels(mam15) 
MAM15<- RenameIdents(mam15, new.cluster.ids) 
DimPlot(MAM15, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()

DimPlot(mam15.tsne, reduction = “tsne”)

names(new.cluster.ids) <- levels(mam15.tsne) 
MAM15.TSNE<- RenameIdents(mam15.tsne, new.cluster.ids) 
DimPlot(MAM15.TSNE, reduction = "tsne", label = TRUE, pt.size = 0.5) + NoLegend()